1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.xml;
18  
19  import com.google.common.annotations.Beta;
20  import com.google.common.annotations.GwtCompatible;
21  import com.google.common.escape.Escaper;
22  import com.google.common.escape.Escapers;
23  
24  /**
25   * {@code Escaper} instances suitable for strings to be included in XML
26   * attribute values and elements' text contents. When possible, avoid manual
27   * escaping by using templating systems and high-level APIs that provide
28   * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or
29   * <a href="http://www.jdom.org/">JDOM</a>.
30   *
31   * <p><b>Note:</b> Currently the escapers provided by this class do not escape
32   * any characters outside the ASCII character range. Unlike HTML escaping the
33   * XML escapers will not escape non-ASCII characters to their numeric entity
34   * replacements. These XML escapers provide the minimal level of escaping to
35   * ensure that the output can be safely included in a Unicode XML document.
36   *
37   *
38   * <p>For details on the behavior of the escapers in this class, see sections
39   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and
40   * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
41   * XML specification.
42   *
43   * @author Alex Matevossian
44   * @author David Beaumont
45   * @since 15.0
46   */
47  @Beta
48  @GwtCompatible
49  public class XmlEscapers {
50    private XmlEscapers() {}
51  
52    private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
53    private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
54  
55    // For each xxxEscaper() method, please add links to external reference pages
56    // that are considered authoritative for the behavior of that escaper.
57  
58    /**
59     * Returns an {@link Escaper} instance that escapes special characters in a
60     * string so it can safely be included in an XML document as element content.
61     * See section
62     * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the
63     * XML specification.
64     *
65     * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not
66     * safe</b> to use this escaper to escape attribute values. Use
67     * {@link #xmlContentEscaper} if the output can appear in element content or
68     * {@link #xmlAttributeEscaper} in attribute values.
69     *
70     * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
71     * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
72     * are not permitted in XML. For more detail see section <a
73     * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
74     * XML specification.
75     *
76     * <p>This escaper does not escape non-ASCII characters to their numeric
77     * character references (NCR). Any non-ASCII characters appearing in the input
78     * will be preserved in the output. Specifically "\r" (carriage return) is
79     * preserved in the output, which may result in it being silently converted to
80     * "\n" when the XML is parsed.
81     *
82     * <p>This escaper does not treat surrogate pairs specially and does not
83     * perform Unicode validation on its input.
84     */
85    public static Escaper xmlContentEscaper() {
86      return XML_CONTENT_ESCAPER;
87    }
88  
89    /**
90     * Returns an {@link Escaper} instance that escapes special characters in a
91     * string so it can safely be included in XML document as an attribute value.
92     * See section
93     * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a>
94     * of the XML specification.
95     *
96     * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control
97     * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which
98     * are not permitted in XML. For more detail see section <a
99     * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the
100    * XML specification.
101    *
102    * <p>This escaper does not escape non-ASCII characters to their numeric
103    * character references (NCR). However, horizontal tab {@code '\t'}, line feed
104    * {@code '\n'} and carriage return {@code '\r'} are escaped to a
105    * corresponding NCR {@code "&#x9;"}, {@code "&#xA;"}, and {@code "&#xD;"}
106    * respectively. Any other non-ASCII characters appearing in the input will
107    * be preserved in the output.
108    *
109    * <p>This escaper does not treat surrogate pairs specially and does not
110    * perform Unicode validation on its input.
111    */
112   public static Escaper xmlAttributeEscaper() {
113     return XML_ATTRIBUTE_ESCAPER;
114   }
115 
116   private static final Escaper XML_ESCAPER;
117   private static final Escaper XML_CONTENT_ESCAPER;
118   private static final Escaper XML_ATTRIBUTE_ESCAPER;
119   static {
120     Escapers.Builder builder = Escapers.builder();
121     // The char values \uFFFE and \uFFFF are explicitly not allowed in XML
122     // (Unicode code points above \uFFFF are represented via surrogate pairs
123     // which means they are treated as pairs of safe characters).
124     builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
125     // Unsafe characters are replaced with the Unicode replacement character.
126     builder.setUnsafeReplacement("\uFFFD");
127 
128     /*
129      * Except for \n, \t, and \r, all ASCII control characters are replaced with
130      * the Unicode replacement character.
131      *
132      * Implementation note: An alternative to the following would be to make a
133      * map that simply replaces the allowed ASCII whitespace characters with
134      * themselves and to set the minimum safe character to 0x20. However this
135      * would slow down the escaping of simple strings that contain \t, \n, or
136      * \r.
137      */
138     for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
139       if (c != '\t' && c != '\n' && c != '\r') {
140         builder.addEscape(c, "\uFFFD");
141       }
142     }
143 
144     // Build the content escaper first and then add quote escaping for the
145     // general escaper.
146     builder.addEscape('&', "&amp;");
147     builder.addEscape('<', "&lt;");
148     builder.addEscape('>', "&gt;");
149     XML_CONTENT_ESCAPER = builder.build();
150     builder.addEscape('\'', "&apos;");
151     builder.addEscape('"', "&quot;");
152     XML_ESCAPER = builder.build();
153     builder.addEscape('\t', "&#x9;");
154     builder.addEscape('\n', "&#xA;");
155     builder.addEscape('\r', "&#xD;");
156     XML_ATTRIBUTE_ESCAPER = builder.build();
157   }
158 }